/*****************************************************************************
 * mc.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2014 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Mans Rullgard <mans@mansr.com>
 *          Stefan Groenroos <stefan.gronroos@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

// note: prefetch stuff assumes 64-byte cacheline

// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
function x264_prefetch_ref_aarch64, export=1
    cmp         w2,  #1
    csel        x2,  xzr, x1, eq
    add         x0,  x0,  #64
    add         x0,  x0,  x2,  lsl #3

    lsl         x2,  x1,  #1
    add         x3,  x1,  x1,  lsl #1
    add         x4,  x0,  x1,  lsl #2

    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0,  x1]
    prfm        pldl1strm, [x0,  x2]
    prfm        pldl1strm, [x0,  x3]
    prfm        pldl1strm, [x4]
    prfm        pldl1strm, [x4,  x1]
    prfm        pldl1strm, [x4,  x2]
    prfm        pldl1strm, [x4,  x3]
    ret
endfunc
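
// For orientation, a minimal C sketch of the routine above (an illustration
// only, not x264's C fallback); __builtin_prefetch stands in for the
// PRFM PLDL1STRM streaming-read hints:
//
//   static void prefetch_ref_c( uint8_t *pix, intptr_t stride, int parity )
//   {
//       pix += 64 + (parity == 1 ? 0 : stride) * 8;
//       for( int i = 0; i < 8; i++ )
//           __builtin_prefetch( pix + i*stride, 0, 0 );   // read, streaming
//   }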

// void prefetch_fenc( uint8_t *pix_y,  intptr_t stride_y,
//                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
.macro x264_prefetch_fenc sub
function x264_prefetch_fenc_\sub\()_aarch64, export=1
    and         w6,  w4,  #3            // mb_x is the fifth argument -> w4
    and         w7,  w4,  #3
    mul         x6,  x6,  x1
    mul         x7,  x7,  x3
    add         x0,  x0,  #64
    add         x2,  x2,  #64

    add         x0,  x0,  x6,  lsl #2
    add         x6,  x0,  x1,  lsl #1
    prfm        pldl1strm, [x0]
    prfm        pldl1strm, [x0,  x1]
    prfm        pldl1strm, [x6]
    prfm        pldl1strm, [x6, x1]

    add         x2,  x2,  x7,  lsl #1
    prfm        pldl1strm, [x2]
    prfm        pldl1strm, [x2,  x3]
.ifc \sub, 422
    add         x7,  x2,  x3,  lsl #1
    prfm        pldl1strm, [x7]
    prfm        pldl1strm, [x7,  x3]
.endif
    ret
endfunc
.endm

x264_prefetch_fenc 420
x264_prefetch_fenc 422
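
// For orientation, a hedged C sketch of the two macro instantiations above
// (illustrative only; chroma_rows is a parameter introduced here to cover
// both the 4:2:0 and 4:2:2 variants, and __builtin_prefetch stands in for
// PRFM PLDL1STRM):
//
//   static void prefetch_fenc_c( uint8_t *pix_y,  intptr_t stride_y,
//                                uint8_t *pix_uv, intptr_t stride_uv,
//                                int mb_x, int chroma_rows /* 2 or 4 */ )
//   {
//       pix_y  += 64 + (mb_x & 3) * stride_y  * 4;
//       pix_uv += 64 + (mb_x & 3) * stride_uv * 2;
//       for( int i = 0; i < 4; i++ )
//           __builtin_prefetch( pix_y  + i*stride_y,  0, 0 );
//       for( int i = 0; i < chroma_rows; i++ )
//           __builtin_prefetch( pix_uv + i*stride_uv, 0, 0 );
//   }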

// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
//                 uint8_t *src1, intptr_t src1_stride,
//                 uint8_t *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon, export=1
    mov         w10, #64
    cmp         w6,  #32
    mov         w9, #\h
    b.eq        pixel_avg_w\w\()_neon
    subs        w7,  w10,  w6
    b.lt        pixel_avg_weight_w\w\()_add_sub_neon     // weight > 64
    cmp         w6,  #0
    b.ge        pixel_avg_weight_w\w\()_add_add_neon
    b           pixel_avg_weight_w\w\()_sub_add_neon     // weight < 0
endfunc
.endm

AVGH  4, 2
AVGH  4, 4
AVGH  4, 8
AVGH  4, 16
AVGH  8, 4
AVGH  8, 8
AVGH  8, 16
AVGH 16, 8
AVGH 16, 16
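
// For orientation, a minimal C sketch of the bi-prediction selected by the
// dispatch above (illustrative only): weight == 32 takes the plain
// rounded-average path (pixel_avg_w*_neon); anything else uses the formula
// below with the second weight fixed at 64-weight.  The add_add/add_sub/
// sub_add variants that follow only exist to keep both NEON multiplications
// unsigned for the three weight ranges.
//
//   static void pixel_avg_weight_c( uint8_t *dst,  intptr_t dst_stride,
//                                   uint8_t *src1, intptr_t src1_stride,
//                                   uint8_t *src2, intptr_t src2_stride,
//                                   int width, int height, int weight )
//   {
//       for( int y = 0; y < height; y++ )
//       {
//           for( int x = 0; x < width; x++ )
//           {
//               int v = ( src1[x]*weight + src2[x]*(64-weight) + 32 ) >> 6;
//               dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
//           }
//           dst  += dst_stride;
//           src1 += src1_stride;
//           src2 += src2_stride;
//       }
//   }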

// 0 < weight < 64
.macro load_weights_add_add
    mov         w6,  w6
.endm
.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlal2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlal       \dst, \s2, v31.8b
.endif
.endm

// weight > 64
.macro load_weights_add_sub
    neg         w7,  w7
.endm
.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s1, v30.16b
    umlsl2      \dst, \s2, v31.16b
.else
    umull       \dst, \s1, v30.8b
    umlsl       \dst, \s2, v31.8b
.endif
.endm

// weight < 0
.macro load_weights_sub_add
    neg         w6,  w6
.endm
.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
    umull2      \dst, \s2, v31.16b
    umlsl2      \dst, \s1, v30.16b
.else
    umull       \dst, \s2, v31.8b
    umlsl       \dst, \s1, v30.8b
.endif
.endm

.macro AVG_WEIGHT ext
function pixel_avg_weight_w4_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9,  w9,  #2
    ld1        {v0.s}[0], [x2], x3
    ld1        {v1.s}[0], [x4], x5
    weight_\ext v4.8h,  v0.8b,  v1.8b
    ld1        {v2.s}[0], [x2], x3
    ld1        {v3.s}[0], [x4], x5
    sqrshrun    v0.8b,  v4.8h,  #6
    weight_\ext v5.8h,  v2.8b,  v3.8b
    st1        {v0.s}[0], [x0], x1
    sqrshrun    v1.8b,  v5.8h,  #6
    st1        {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w8_\ext\()_neon
    load_weights_\ext
    dup         v30.8b, w6
    dup         v31.8b, w7
1:  // height loop
    subs        w9,  w9,  #4
    ld1        {v0.8b}, [x2], x3
    ld1        {v1.8b}, [x4], x5
    weight_\ext v16.8h, v0.8b,  v1.8b
    ld1        {v2.8b}, [x2], x3
    ld1        {v3.8b}, [x4], x5
    weight_\ext v17.8h, v2.8b,  v3.8b
    ld1        {v4.8b}, [x2], x3
    ld1        {v5.8b}, [x4], x5
    weight_\ext v18.8h, v4.8b,  v5.8b
    ld1        {v6.8b}, [x2], x3
    ld1        {v7.8b}, [x4], x5
    weight_\ext v19.8h, v6.8b,  v7.8b
    sqrshrun    v0.8b,  v16.8h, #6
    sqrshrun    v1.8b,  v17.8h, #6
    sqrshrun    v2.8b,  v18.8h, #6
    sqrshrun    v3.8b,  v19.8h, #6
    st1        {v0.8b}, [x0], x1
    st1        {v1.8b}, [x0], x1
    st1        {v2.8b}, [x0], x1
    st1        {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_weight_w16_\ext\()_neon
    load_weights_\ext
    dup         v30.16b, w6
    dup         v31.16b, w7
1:  // height loop
    subs        w9,  w9,  #2
    ld1        {v0.16b}, [x2], x3
    ld1        {v1.16b}, [x4], x5
    weight_\ext v16.8h, v0.8b,  v1.8b
    weight_\ext v17.8h, v0.16b, v1.16b, 2
    ld1        {v2.16b}, [x2], x3
    ld1        {v3.16b}, [x4], x5
    weight_\ext v18.8h, v2.8b,  v3.8b
    weight_\ext v19.8h, v2.16b, v3.16b, 2
    sqrshrun    v0.8b,  v16.8h, #6
    sqrshrun    v1.8b,  v18.8h, #6
    sqrshrun2   v0.16b, v17.8h, #6
    sqrshrun2   v1.16b, v19.8h, #6
    st1        {v0.16b}, [x0], x1
    st1        {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc
.endm

AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add

function pixel_avg_w4_neon
1:  subs        w9,  w9,  #2
    ld1        {v0.s}[0], [x2], x3
    ld1        {v2.s}[0], [x4], x5
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1        {v1.s}[0], [x2], x3
    ld1        {v3.s}[0], [x4], x5
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1        {v0.s}[0], [x0], x1
    st1        {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w8_neon
1:  subs        w9,  w9,  #4
    ld1        {v0.8b}, [x2], x3
    ld1        {v1.8b}, [x4], x5
    ld1        {v2.8b}, [x2], x3
    urhadd      v0.8b,  v0.8b,  v1.8b
    ld1        {v3.8b}, [x4], x5
    st1        {v0.8b}, [x0], x1
    ld1        {v4.8b}, [x2], x3
    urhadd      v1.8b,  v2.8b,  v3.8b
    ld1        {v5.8b}, [x4], x5
    st1        {v1.8b}, [x0], x1
    ld1        {v6.8b}, [x2], x3
    ld1        {v7.8b}, [x4], x5
    urhadd      v2.8b,  v4.8b,  v5.8b
    urhadd      v3.8b,  v6.8b,  v7.8b
    st1        {v2.8b}, [x0], x1
    st1        {v3.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function pixel_avg_w16_neon
1:  subs        w9,  w9,  #4
    ld1        {v0.16b}, [x2], x3
    ld1        {v1.16b}, [x4], x5
    ld1        {v2.16b}, [x2], x3
    urhadd      v0.16b, v0.16b, v1.16b
    ld1        {v3.16b}, [x4], x5
    st1        {v0.16b}, [x0], x1
    ld1        {v4.16b}, [x2], x3
    urhadd      v1.16b, v2.16b, v3.16b
    ld1        {v5.16b}, [x4], x5
    st1        {v1.16b}, [x0], x1
    ld1        {v6.16b}, [x2], x3
    ld1        {v7.16b}, [x4], x5
    urhadd      v2.16b, v4.16b, v5.16b
    urhadd      v3.16b, v6.16b, v7.16b
    st1        {v2.16b}, [x0], x1
    st1        {v3.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w4_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1        {v0.s}[0],  [x2], x3
    ld1        {v2.s}[0],  [x4], x3
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1        {v1.s}[0],  [x2], x3
    ld1        {v3.s}[0],  [x4], x3
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1        {v0.s}[0], [x0], x1
    st1        {v1.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w8_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1        {v0.8b}, [x2], x3
    ld1        {v2.8b}, [x4], x3
    urhadd      v0.8b,  v0.8b,  v2.8b
    ld1        {v1.8b}, [x2], x3
    ld1        {v3.8b}, [x4], x3
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1        {v0.8b}, [x0], x1
    st1        {v1.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w16_neon, export=1
1:
    subs        w5,  w5,  #2
    ld1        {v0.16b}, [x2], x3
    ld1        {v2.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    ld1        {v1.16b}, [x2], x3
    ld1        {v3.16b}, [x4], x3
    urhadd      v1.16b, v1.16b, v3.16b
    st1        {v0.16b}, [x0], x1
    st1        {v1.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_pixel_avg2_w20_neon, export=1
    sub         x1,  x1,  #16
1:
    subs        w5,  w5,  #2
    ld1        {v0.16b,v1.16b}, [x2], x3
    ld1        {v2.16b,v3.16b}, [x4], x3
    urhadd      v0.16b, v0.16b, v2.16b
    urhadd      v1.8b,  v1.8b,  v3.8b
    ld1        {v4.16b,v5.16b}, [x2], x3
    ld1        {v6.16b,v7.16b}, [x4], x3
    urhadd      v4.16b, v4.16b, v6.16b
    urhadd      v5.8b,  v5.8b,  v7.8b
    st1        {v0.16b},  [x0], #16
    st1        {v1.s}[0], [x0], x1
    st1        {v4.16b},  [x0], #16
    st1        {v5.s}[0], [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_prologue type
    mov         w9,  w5                 // height
.ifc \type, full
    ldr         w12, [x4, #32]          // denom
.endif
    ldp         w4,  w5,  [x4, #32+4]   // scale, offset
    dup         v0.16b, w4
    dup         v1.8h,  w5
.ifc \type, full
    neg         w12, w12
    dup         v2.8h,  w12
.endif
.endm

// void mc_weight( uint8_t *dst, intptr_t dst_stride, uint8_t *src,
//                 intptr_t src_stride, const x264_weight_t *weight, int h )
function x264_mc_weight_w20_neon, export=1
    weight_prologue full
    sub         x1,  x1,  #16
1:
    subs        w9,  w9,  #2
    ld1        {v16.8b,v17.8b,v18.8b}, [x2], x3
    ld1        {v19.8b,v20.8b,v21.8b}, [x2], x3
    umull       v22.8h, v16.8b, v0.8b
    umull       v23.8h, v17.8b, v0.8b
    zip1        v18.2s, v18.2s, v21.2s
    umull       v25.8h, v19.8b, v0.8b
    umull       v26.8h, v20.8b, v0.8b
    umull       v24.8h, v18.8b, v0.8b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    srshl       v26.8h, v26.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    add         v26.8h, v26.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v6.8b,  v24.8h
    sqxtun      v5.8b,  v25.8h
    sqxtun2     v5.16b, v26.8h
    st1        {v4.16b},  [x0], #16
    st1        {v6.s}[0], [x0], x1
    st1        {v5.16b},  [x0], #16
    st1        {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc
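
// For orientation, a minimal C sketch of the explicit weighting the
// mc_weight_w* functions implement (illustrative only; the member names
// follow x264_weight_t as read by weight_prologue: i_denom, i_scale,
// i_offset).  The full variants apply the rounding shift by i_denom; the
// _nodenom variants further below skip it because i_denom is 0.
//
//   static void mc_weight_c( uint8_t *dst, intptr_t dst_stride,
//                            uint8_t *src, intptr_t src_stride,
//                            const x264_weight_t *wt, int width, int height )
//   {
//       for( int y = 0; y < height; y++ )
//       {
//           for( int x = 0; x < width; x++ )
//           {
//               int v = src[x] * wt->i_scale;
//               if( wt->i_denom > 0 )
//                   v = ( v + (1 << (wt->i_denom-1)) ) >> wt->i_denom;
//               v += wt->i_offset;
//               dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
//           }
//           dst += dst_stride;
//           src += src_stride;
//       }
//   }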

function x264_mc_weight_w16_neon, export=1
    weight_prologue full
weight16_loop:
1:
    subs        w9,  w9,  #2
    ld1        {v4.16b}, [x2], x3
    ld1        {v5.16b}, [x2], x3
    umull       v22.8h, v4.8b,  v0.8b
    umull2      v23.8h, v4.16b, v0.16b
    umull       v24.8h, v5.8b,  v0.8b
    umull2      v25.8h, v5.16b, v0.16b
    srshl       v22.8h, v22.8h, v2.8h
    srshl       v23.8h, v23.8h, v2.8h
    srshl       v24.8h, v24.8h, v2.8h
    srshl       v25.8h, v25.8h, v2.8h
    add         v22.8h, v22.8h, v1.8h
    add         v23.8h, v23.8h, v1.8h
    add         v24.8h, v24.8h, v1.8h
    add         v25.8h, v25.8h, v1.8h
    sqxtun      v4.8b,  v22.8h
    sqxtun2     v4.16b, v23.8h
    sqxtun      v5.8b,  v24.8h
    sqxtun2     v5.16b, v25.8h
    st1        {v4.16b}, [x0], x1
    st1        {v5.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_neon, export=1
    weight_prologue full
1:
    subs        w9,  w9,  #2
    ld1        {v16.8b}, [x2], x3
    ld1        {v17.8b}, [x2], x3
    umull       v4.8h,  v16.8b, v0.8b
    umull       v5.8h,  v17.8b, v0.8b
    srshl       v4.8h,  v4.8h,  v2.8h
    srshl       v5.8h,  v5.8h,  v2.8h
    add         v4.8h,  v4.8h,  v1.8h
    add         v5.8h,  v5.8h,  v1.8h
    sqxtun      v16.8b, v4.8h
    sqxtun      v17.8b, v5.8h
    st1        {v16.8b}, [x0], x1
    st1        {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_neon, export=1
    weight_prologue full
1:
    subs        w9,  w9,  #2
    ld1        {v16.s}[0], [x2], x3
    ld1        {v16.s}[1], [x2], x3
    umull       v4.8h,  v16.8b, v0.8b
    srshl       v4.8h,  v4.8h,  v2.8h
    add         v4.8h,  v4.8h,  v1.8h
    sqxtun      v16.8b, v4.8h
    st1        {v16.s}[0], [x0], x1
    st1        {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w20_nodenom_neon, export=1
    weight_prologue nodenom
    sub         x1,  x1,  #16
1:
    subs        w9,  w9,  #2
    ld1        {v16.8b,v17.8b,v18.8b}, [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1        {v19.8b,v20.8b,v21.8b}, [x2], x3
    mov         v31.16b, v1.16b
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    zip1        v18.2s, v18.2s, v21.2s
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v28.8h, v17.8b, v0.8b
    umlal       v31.8h, v18.8b, v0.8b
    umlal       v29.8h, v19.8b, v0.8b
    umlal       v30.8h, v20.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    sqxtun      v6.8b,  v31.8h
    st1        {v4.16b},  [x0], #16
    st1        {v6.s}[0], [x0], x1
    st1        {v5.16b},  [x0], #16
    st1        {v6.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w16_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1        {v6.16b},  [x2], x3
    mov         v27.16b, v1.16b
    mov         v28.16b, v1.16b
    ld1        {v7.16b},  [x2], x3
    mov         v29.16b, v1.16b
    mov         v30.16b, v1.16b
    umlal       v27.8h, v6.8b,  v0.8b
    umlal2      v28.8h, v6.16b, v0.16b
    umlal       v29.8h, v7.8b,  v0.8b
    umlal2      v30.8h, v7.16b, v0.16b
    sqxtun      v4.8b,  v27.8h
    sqxtun2     v4.16b, v28.8h
    sqxtun      v5.8b,  v29.8h
    sqxtun2     v5.16b, v30.8h
    st1        {v4.16b},  [x0], x1
    st1        {v5.16b},  [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1        {v16.8b}, [x2], x3
    mov         v27.16b, v1.16b
    ld1        {v17.8b}, [x2], x3
    mov         v29.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    umlal       v29.8h, v17.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    sqxtun      v5.8b,  v29.8h
    st1        {v4.8b},  [x0], x1
    st1        {v5.8b},  [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_nodenom_neon, export=1
    weight_prologue nodenom
1:
    subs        w9,  w9,  #2
    ld1        {v16.s}[0], [x2], x3
    ld1        {v16.s}[1], [x2], x3
    mov         v27.16b, v1.16b
    umlal       v27.8h, v16.8b, v0.8b
    sqxtun      v4.8b,  v27.8h
    st1        {v4.s}[0],  [x0], x1
    st1        {v4.s}[1],  [x0], x1
    b.gt        1b
    ret
endfunc

.macro weight_simple_prologue
    ldr         w6,  [x4]               // offset
    dup         v1.16b,  w6
.endm

.macro weight_simple name op
function x264_mc_weight_w20_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ldr         s18, [x2, #16]
    ld1        {v16.16b}, [x2], x3
    ldr         s19, [x2, #16]
    ld1        {v17.16b}, [x2], x3
    \op         v18.8b,  v18.8b,  v1.8b
    \op         v16.16b, v16.16b, v1.16b
    \op         v19.8b,  v19.8b,  v1.8b
    \op         v17.16b, v17.16b, v1.16b
    str         s18, [x0, #16]
    st1        {v16.16b}, [x0], x1
    str         s19, [x0, #16]
    st1        {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w16_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1        {v16.16b}, [x2], x3
    ld1        {v17.16b}, [x2], x3
    \op         v16.16b, v16.16b, v1.16b
    \op         v17.16b, v17.16b, v1.16b
    st1        {v16.16b}, [x0], x1
    st1        {v17.16b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w8_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1        {v16.8b}, [x2], x3
    ld1        {v17.8b}, [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    \op         v17.8b, v17.8b, v1.8b
    st1        {v16.8b}, [x0], x1
    st1        {v17.8b}, [x0], x1
    b.gt        1b
    ret
endfunc

function x264_mc_weight_w4_\name\()_neon, export=1
    weight_simple_prologue
1:
    subs        w5,  w5,  #2
    ld1        {v16.s}[0], [x2], x3
    ld1        {v16.s}[1], [x2], x3
    \op         v16.8b, v16.8b, v1.8b
    st1        {v16.s}[0], [x0], x1
    st1        {v16.s}[1], [x0], x1
    b.gt        1b
    ret
endfunc
.endm

weight_simple offsetadd, uqadd
weight_simple offsetsub, uqsub


// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
function x264_mc_copy_w4_neon, export=1
1:
    subs        w4,  w4,  #4
    ld1        {v0.s}[0],  [x2],  x3
    ld1        {v1.s}[0],  [x2],  x3
    ld1        {v2.s}[0],  [x2],  x3
    ld1        {v3.s}[0],  [x2],  x3
    st1        {v0.s}[0],  [x0],  x1
    st1        {v1.s}[0],  [x0],  x1
    st1        {v2.s}[0],  [x0],  x1
    st1        {v3.s}[0],  [x0],  x1
    b.gt        1b
    ret
endfunc

function x264_mc_copy_w8_neon, export=1
1:  subs        w4,  w4,  #4
    ld1        {v0.8b},  [x2],  x3
    ld1        {v1.8b},  [x2],  x3
    ld1        {v2.8b},  [x2],  x3
    ld1        {v3.8b},  [x2],  x3
    st1        {v0.8b},  [x0],  x1
    st1        {v1.8b},  [x0],  x1
    st1        {v2.8b},  [x0],  x1
    st1        {v3.8b},  [x0],  x1
    b.gt        1b
    ret
endfunc

function x264_mc_copy_w16_neon, export=1
1:  subs        w4,  w4,  #4
    ld1        {v0.16b}, [x2],  x3
    ld1        {v1.16b}, [x2],  x3
    ld1        {v2.16b}, [x2],  x3
    ld1        {v3.16b}, [x2],  x3
    st1        {v0.16b}, [x0],  x1
    st1        {v1.16b}, [x0],  x1
    st1        {v2.16b}, [x0],  x1
    st1        {v3.16b}, [x0],  x1
    b.gt        1b
    ret
endfunc
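
// For orientation, a minimal C sketch of the mc_copy_w* functions above
// (illustrative only; width would be 4, 8 or 16):
//
//   #include <string.h>
//
//   static void mc_copy_c( uint8_t *dst, intptr_t dst_stride,
//                          uint8_t *src, intptr_t src_stride,
//                          int width, int height )
//   {
//       for( int y = 0; y < height; y++ )
//       {
//           memcpy( dst, src, width );
//           dst += dst_stride;
//           src += src_stride;
//       }
//   }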

// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
//                           intptr_t i_dst_stride,
//                           uint8_t *src, intptr_t i_src_stride,
//                           int dx, int dy, int i_width, int i_height );
function x264_mc_chroma_neon, export=1
    ldr         w15, [sp]               // height
    sbfx        x12, x6,  #3,  #29      // asr(3) and sign extend
    sbfx        x11, x5,  #3,  #29      // asr(3) and sign extend
    cmp         w7,  #4
    mul         x12, x12, x4
    add         x3,  x3,  x11, lsl #1

    and         w5,  w5,  #7
    and         w6,  w6,  #7

    add         x3,  x3,  x12

    //pld             [x3]
    //pld             [x3, x4]

    b.gt        mc_chroma_w8_neon
    b.eq        mc_chroma_w4_neon
endfunc

.macro CHROMA_MC_START r00, r01, r10, r11
    mul         w12, w5,  w6            // cD = d8x    *d8y
    lsl         w13, w5,  #3
    add         w9,  w12,  #64
    lsl         w14, w6,  #3
    tst         w12, w12
    sub         w9,  w9,  w13
    sub         w10, w13, w12           // cB = d8x    *(8-d8y);
    sub         w11, w14, w12           // cC = (8-d8x)*d8y
    sub         w9,  w9,  w14           // cA = (8-d8x)*(8-d8y);
.endm
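
// For orientation, a hedged C sketch of the bilinear chroma filter these
// weights feed (illustrative only): the source is interleaved U/V (hence the
// *2 horizontal step), results are rounded by 32 and shifted by 6, and when
// either fractional offset is zero the code below falls back to a 1-D filter
// (which degenerates to a copy when both are zero).
//
//   static void mc_chroma_c( uint8_t *dst_u, uint8_t *dst_v, intptr_t i_dst,
//                            uint8_t *src, intptr_t i_src,
//                            int dx, int dy, int width, int height )
//   {
//       src += (dy >> 3) * i_src + (dx >> 3) * 2;
//       int d8x = dx & 7, d8y = dy & 7;
//       int cA = (8-d8x)*(8-d8y), cB = d8x*(8-d8y);
//       int cC = (8-d8x)*d8y,     cD = d8x*d8y;
//       for( int y = 0; y < height; y++,
//            src += i_src, dst_u += i_dst, dst_v += i_dst )
//           for( int x = 0; x < width; x++ )
//           {
//               const uint8_t *s = src + 2*x;
//               dst_u[x] = ( cA*s[0] + cB*s[2]
//                          + cC*s[i_src+0] + cD*s[i_src+2] + 32 ) >> 6;
//               dst_v[x] = ( cA*s[1] + cB*s[3]
//                          + cC*s[i_src+1] + cD*s[i_src+3] + 32 ) >> 6;
//           }
//   }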

.macro CHROMA_MC width, vsize
function mc_chroma_w\width\()_neon
// since the element size varies, there's a different index for the 2nd store
.if \width == 4
    .set st2, 1
.else
    .set st2, 2
.endif
    CHROMA_MC_START
    b.eq        2f

    ld2        {v28.8b,v29.8b}, [x3], x4
    dup         v0.8b,  w9               // cA
    dup         v1.8b,  w10              // cB

    ext         v6.8b, v28.8b, v6.8b,  #1
    ext         v7.8b, v29.8b, v7.8b,  #1

    ld2        {v30.8b,v31.8b}, [x3], x4
    dup         v2.8b,  w11              // cC
    dup         v3.8b,  w12              // cD

    ext         v22.8b, v30.8b, v22.8b,  #1
    ext         v23.8b, v31.8b, v23.8b,  #1

    trn1        v0.2s,  v0.2s,  v1.2s
    trn1        v2.2s,  v2.2s,  v3.2s

    trn1        v4.2s,  v28.2s, v6.2s
    trn1        v5.2s,  v29.2s, v7.2s
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s
1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v20.8b, v2.8b
    umull       v17.8h, v5.8b,  v0.8b
    umlal       v17.8h, v21.8b, v2.8b

    ld2        {v28.8b,v29.8b}, [x3], x4
    transpose   v24.2d, v25.2d, v16.2d, v17.2d

    ext         v6.8b, v28.8b, v6.8b,  #1
    ext         v7.8b, v29.8b, v7.8b,  #1

    trn1        v4.2s,  v28.2s, v6.2s
    trn1        v5.2s,  v29.2s, v7.2s

    add         v16.8h, v24.8h, v25.8h

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v4.8b,  v2.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v5.8b,  v2.8b

    ld2        {v30.8b,v31.8b}, [x3], x4
    transpose   v26.2d, v27.2d, v18.2d, v19.2d

    ext         v22.8b, v30.8b, v22.8b,  #1
    ext         v23.8b, v31.8b, v23.8b,  #1
    trn1        v20.2s, v30.2s, v22.2s
    trn1        v21.2s, v31.2s, v23.2s

    add         v17.8h, v26.8h, v27.8h

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.\vsize}[0],   [x0], x2
    st1        {v16.\vsize}[st2], [x1], x2
    st1        {v17.\vsize}[0],   [x0], x2
    st1        {v17.\vsize}[st2], [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10,  w11
    dup         v0.8b,  w9
    dup         v1.8b,  w10

    b.eq        4f

    ld1        {v4.8b}, [x3], x4
    ld1        {v6.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    ld1        {v4.8b}, [x3], x4
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v6.8b,  v0.8b
    ld1        {v6.8b}, [x3], x4
    umlal       v17.8h, v4.8b,  v1.8b

    rshrn       v20.8b, v16.8h, #6      // uvuvuvuv
    rshrn       v21.8b, v17.8h, #6      // uvuvuvuv

    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.\vsize}[0],   [x0], x2
    st1        {v16.\vsize}[st2], [x0], x2
    st1        {v17.\vsize}[0],   [x1], x2
    st1        {v17.\vsize}[st2], [x1], x2
    b.gt        3b

    ret

4:  // dy is 0
    ld1        {v4.8b,v5.8b}, [x3], x4
    ld1        {v6.8b,v7.8b}, [x3], x4

    ext         v5.8b,  v4.8b,  v5.8b,  #2
    ext         v7.8b,  v6.8b,  v7.8b,  #2
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v5.8b,  v1.8b
    umull       v17.8h, v6.8b,  v0.8b
    umlal       v17.8h, v7.8b,  v1.8b

    ld1        {v4.8b,v5.8b}, [x3], x4
    ld1        {v6.8b,v7.8b}, [x3], x4
    rshrn       v20.8b, v16.8h, #6
    rshrn       v21.8b, v17.8h, #6
    ext         v5.8b,  v4.8b,  v5.8b,  #2
    ext         v7.8b,  v6.8b,  v7.8b,  #2
    uzp1        v16.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv
    uzp2        v17.8b, v20.8b, v21.8b  // d16=uuuu|uuuu, d17=vvvv|vvvv

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.\vsize}[0],   [x0], x2
    st1        {v16.\vsize}[st2], [x0], x2
    st1        {v17.\vsize}[0],   [x1], x2
    st1        {v17.\vsize}[st2], [x1], x2
    b.gt        5b

    ret
endfunc
.endm

    CHROMA_MC 2, h
    CHROMA_MC 4, s

function mc_chroma_w8_neon
    CHROMA_MC_START
    b.eq        2f
    ld2        {v4.16b,v5.16b}, [x3], x4
    ld2        {v20.16b,v21.16b}, [x3], x4
    dup         v0.8b, w9               // cA
    dup         v1.8b, w10              // cB

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    dup         v2.8b, w11              // cC
    dup         v3.8b, w12              // cD

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

1:  // height loop, interpolate xy
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b
    umlal       v16.8h, v6.8b,  v1.8b
    umlal       v16.8h, v20.8b, v2.8b
    umlal       v16.8h, v22.8b, v3.8b

    umull       v17.8h, v5.8b,  v0.8b
    umlal       v17.8h, v7.8b,  v1.8b
    umlal       v17.8h, v21.8b, v2.8b
    umlal       v17.8h, v23.8b, v3.8b

    ld2        {v4.16b,v5.16b}, [x3], x4

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umlal       v18.8h, v4.8b,  v2.8b
    umlal       v18.8h, v6.8b,  v3.8b

    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b
    umlal       v19.8h, v5.8b,  v2.8b
    umlal       v19.8h, v7.8b,  v3.8b

    ld2        {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.8b}, [x0], x2
    st1        {v17.8b}, [x1], x2
    st1        {v18.8b}, [x0], x2
    st1        {v19.8b}, [x1], x2
    b.gt        1b

    ret
2:  // dx or dy are 0
    tst         w11, w11
    add         w10, w10, w11
    dup         v0.8b, w9
    dup         v1.8b, w10

    b.eq        4f

    ld2        {v4.8b,v5.8b}, [x3], x4
    ld2        {v6.8b,v7.8b}, [x3], x4
3:  // vertical interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b //U
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v5.8b,  v0.8b //V
    umlal       v17.8h, v7.8b,  v1.8b

    ld2        {v4.8b,v5.8b}, [x3], x4

    umull       v18.8h, v6.8b,  v0.8b
    umlal       v18.8h, v4.8b,  v1.8b
    umull       v19.8h, v7.8b,  v0.8b
    umlal       v19.8h, v5.8b,  v1.8b

    ld2        {v6.8b,v7.8b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.8b}, [x0], x2
    st1        {v17.8b}, [x1], x2
    st1        {v18.8b}, [x0], x2
    st1        {v19.8b}, [x1], x2
    b.gt        3b

    ret
4:  // dy is 0
    ld2        {v4.16b,v5.16b}, [x3], x4
    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ld2        {v20.16b,v21.16b}, [x3], x4
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1
5:  // horizontal interpolation loop
    subs        w15, w15, #2
    umull       v16.8h, v4.8b,  v0.8b //U
    umlal       v16.8h, v6.8b,  v1.8b
    umull       v17.8h, v5.8b,  v0.8b //V
    umlal       v17.8h, v7.8b,  v1.8b

    ld2        {v4.16b,v5.16b}, [x3], x4

    umull       v18.8h, v20.8b, v0.8b
    umlal       v18.8h, v22.8b, v1.8b
    umull       v19.8h, v21.8b, v0.8b
    umlal       v19.8h, v23.8b, v1.8b

    ld2        {v20.16b,v21.16b}, [x3], x4

    rshrn       v16.8b, v16.8h, #6
    rshrn       v17.8b, v17.8h, #6
    rshrn       v18.8b, v18.8h, #6
    rshrn       v19.8b, v19.8h, #6

    ext         v6.16b, v4.16b, v4.16b, #1
    ext         v7.16b, v5.16b, v5.16b, #1
    ext         v22.16b, v20.16b, v20.16b, #1
    ext         v23.16b, v21.16b, v21.16b, #1

    //pld         [x3]
    //pld         [x3, x4]

    st1        {v16.8b}, [x0], x2
    st1        {v17.8b}, [x1], x2
    st1        {v18.8b}, [x0], x2
    st1        {v19.8b}, [x1], x2
    b.gt        5b

    ret
endfunc

//void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
//                  intptr_t stride, int width, int height, int16_t *buf )
function x264_hpel_filter_neon, export=1
    ubfm        x9,  x3,  #0,  #3
    add         w15, w5,  w9
    sub         x13, x3,  x9            // align src
    sub         x10, x0,  x9
    sub         x11, x1,  x9
    sub         x12, x2,  x9
    movi        v30.16b,  #5
    movi        v31.16b,  #20
1:  // line start
    mov         x3,  x13
    mov         x2,  x12
    mov         x1,  x11
    mov         x0,  x10
    add         x7,  x3,  #16           // src pointer next 16b for horiz filter
    mov         x5,  x15                // restore width
    sub         x3,  x3,  x4,  lsl #1   // src - 2*stride
    ld1        {v28.16b}, [x7], #16     // src[16:31]

    add         x9,  x3,  x5            // holds src - 2*stride + width

    ld1        {v16.16b}, [x3], x4      // src-2*stride[0:15]
    ld1        {v17.16b}, [x3], x4      // src-1*stride[0:15]
    ld1        {v18.16b}, [x3], x4      // src+0*stride[0:15]
    ld1        {v19.16b}, [x3], x4      // src+1*stride[0:15]
    ld1        {v20.16b}, [x3], x4      // src+2*stride[0:15]
    ld1        {v21.16b}, [x3], x4      // src+3*stride[0:15]

    ext         v22.16b, v7.16b,  v18.16b, #14
    uaddl       v1.8h,   v16.8b,  v21.8b
    ext         v26.16b, v18.16b, v28.16b, #3
    umlsl       v1.8h,   v17.8b,  v30.8b
    ext         v23.16b, v7.16b,  v18.16b, #15
    umlal       v1.8h,   v18.8b,  v31.8b
    ext         v24.16b, v18.16b, v28.16b, #1
    umlal       v1.8h,   v19.8b,  v31.8b
    ext         v25.16b, v18.16b, v28.16b, #2
    umlsl       v1.8h,   v20.8b,  v30.8b
2:  // next 16 pixel of line
    subs        x5,  x5,  #16
    sub         x3,  x9,  x5            // src - 2*stride += 16

    uaddl       v4.8h,  v22.8b,  v26.8b
    uaddl2      v5.8h,  v22.16b, v26.16b
    sqrshrun    v6.8b,  v1.8h,   #5
    umlsl       v4.8h,  v23.8b,  v30.8b
    umlsl2      v5.8h,  v23.16b, v30.16b
    umlal       v4.8h,  v18.8b,  v31.8b
    umlal2      v5.8h,  v18.16b, v31.16b
    umlal       v4.8h,  v24.8b,  v31.8b
    umlal2      v5.8h,  v24.16b, v31.16b
    umlsl       v4.8h,  v25.8b,  v30.8b
    umlsl2      v5.8h,  v25.16b, v30.16b

    uaddl2      v2.8h,  v16.16b, v21.16b
    sqrshrun    v4.8b,  v4.8h,   #5
    mov         v7.16b, v18.16b
    sqrshrun2   v4.16b, v5.8h,   #5

    umlsl2      v2.8h,  v17.16b, v30.16b
    ld1        {v16.16b}, [x3],  x4      // src-2*stride[0:15]
    umlal2      v2.8h,  v18.16b, v31.16b
    ld1        {v17.16b}, [x3],  x4      // src-1*stride[0:15]
    umlal2      v2.8h,  v19.16b, v31.16b
    ld1        {v18.16b}, [x3],  x4      // src+0*stride[0:15]
    umlsl2      v2.8h,  v20.16b, v30.16b
    ld1        {v19.16b}, [x3],  x4      // src+1*stride[0:15]
    st1        {v4.16b},  [x0],  #16
    sqrshrun2   v6.16b, v2.8h,   #5
    ld1        {v20.16b}, [x3],  x4      // src+2*stride[0:15]
    ld1        {v21.16b}, [x3],  x4      // src+3*stride[0:15]

    ext         v22.16b, v0.16b, v1.16b, #12
    ext         v26.16b, v1.16b, v2.16b, #6
    ext         v23.16b, v0.16b, v1.16b, #14
    st1        {v6.16b},  [x1],  #16
    uaddl       v3.8h,   v16.8b, v21.8b
    ext         v25.16b, v1.16b, v2.16b, #4
    umlsl       v3.8h,   v17.8b, v30.8b
    ext         v24.16b, v1.16b, v2.16b, #2

    umlal       v3.8h,  v18.8b, v31.8b
    add         v4.8h,  v22.8h, v26.8h
    umlal       v3.8h,  v19.8b, v31.8b
    add         v5.8h,  v23.8h, v25.8h
    umlsl       v3.8h,  v20.8b, v30.8b
    add         v6.8h,  v24.8h, v1.8h

    ext         v22.16b, v1.16b, v2.16b, #12
    ext         v26.16b, v2.16b, v3.16b, #6
    ext         v23.16b, v1.16b, v2.16b, #14
    ext         v25.16b, v2.16b, v3.16b, #4
    ext         v24.16b, v2.16b, v3.16b, #2

    add         v22.8h, v22.8h, v26.8h
    add         v23.8h, v23.8h, v25.8h
    add         v24.8h, v24.8h, v2.8h

    sub         v4.8h,  v4.8h,  v5.8h   // a-b
    sub         v5.8h,  v5.8h,  v6.8h   // b-c

    sub         v22.8h, v22.8h, v23.8h  // a-b
    sub         v23.8h, v23.8h, v24.8h  // b-c

    sshr        v4.8h,  v4.8h,  #2      // (a-b)/4
    sshr        v22.8h, v22.8h, #2      // (a-b)/4
    sub         v4.8h,  v4.8h,  v5.8h   // (a-b)/4-b+c
    sub         v22.8h, v22.8h, v23.8h  // (a-b)/4-b+c
    sshr        v4.8h,  v4.8h,  #2      // ((a-b)/4-b+c)/4
    sshr        v22.8h, v22.8h, #2      // ((a-b)/4-b+c)/4
    add         v4.8h,  v4.8h,  v6.8h   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    add         v22.8h, v22.8h, v24.8h  // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16

    sqrshrun    v4.8b,   v4.8h,   #6
    ld1        {v28.16b}, [x7],   #16   // src[16:31]
    mov         v0.16b,  v2.16b
    ext         v23.16b, v7.16b,  v18.16b, #15
    sqrshrun2   v4.16b,  v22.8h,  #6
    mov         v1.16b,  v3.16b
    ext         v22.16b, v7.16b,  v18.16b, #14
    ext         v24.16b, v18.16b, v28.16b, #1
    ext         v25.16b, v18.16b, v28.16b, #2
    ext         v26.16b, v18.16b, v28.16b, #3

    st1        {v4.16b}, [x2], #16
    b.gt        2b

    subs        w6,  w6,  #1
    add         x10,  x10,  x4
    add         x11,  x11,  x4
    add         x12,  x12,  x4
    add         x13,  x13,  x4
    b.gt        1b

    ret
endfunc
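
// For orientation, a hedged C sketch of the half-pel filtering above
// (illustrative only, 8-bit, ignoring the asm's alignment and 16-pixel
// tiling): the 6-tap kernel is {1,-5,20,20,-5,1}; dsth/dstv are rounded by
// 16 and shifted by 5, while dstc filters the 16-bit vertical sums again,
// which is what the ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 shift/add trick
// computes without overflowing 16 bits.
//
//   #define TAP( p, x, d ) ( (p)[(x)-2*(d)] - 5*(p)[(x)-(d)] + 20*(p)[(x)] \
//                          + 20*(p)[(x)+(d)] - 5*(p)[(x)+2*(d)] + (p)[(x)+3*(d)] )
//   #define CLIP( v )      ( (v) < 0 ? 0 : (v) > 255 ? 255 : (v) )
//
//   static void hpel_filter_c( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
//                              uint8_t *src, intptr_t stride,
//                              int width, int height, int16_t *buf )
//   {
//       for( int y = 0; y < height; y++ )
//       {
//           for( int x = -2; x < width+3; x++ )
//           {
//               int v = TAP( src, x, stride );      // vertical 6-tap
//               dstv[x]  = CLIP( (v + 16) >> 5 );
//               buf[x+2] = v;                       // keep the 16-bit sums
//           }
//           for( int x = 0; x < width; x++ )
//           {
//               dstc[x] = CLIP( (TAP( buf+2, x, 1 ) + 512) >> 10 );
//               dsth[x] = CLIP( (TAP( src,   x, 1 ) +  16) >>  5 );
//           }
//           dsth += stride; dstv += stride; dstc += stride; src += stride;
//       }
//   }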

// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
//                         uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
//                         intptr_t dst_stride, int width, int height )
function x264_frame_init_lowres_core_neon, export=1
    ldr         w8,  [sp]
    sub         x10, x6,  w7, uxtw      // dst_stride - width
    and         x10, x10, #~15

1:
    mov         w9,  w7                 // width
    mov         x11, x0                 // src0
    add         x12, x0,  x5            // src1 = src0 + src_stride
    add         x13, x0,  x5,  lsl #1   // src2 = src1 + src_stride

    ld2        {v0.16b,v1.16b}, [x11], #32
    ld2        {v2.16b,v3.16b}, [x12], #32
    ld2        {v4.16b,v5.16b}, [x13], #32

    urhadd      v20.16b, v0.16b,  v2.16b    // s0[2x]   + s1[2x]
    urhadd      v22.16b, v2.16b,  v4.16b    // s1[2x]   + s2[2x]
2:
    subs        w9,  w9,  #16
    urhadd      v21.16b, v1.16b,  v3.16b    // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b,  v5.16b    // s1[2x+1] + s2[2x+1]

    ld2        {v0.16b,v1.16b}, [x11], #32
    ld2        {v2.16b,v3.16b}, [x12], #32
    ld2        {v4.16b,v5.16b}, [x13], #32
    urhadd      v30.16b, v0.16b,  v2.16b    // loop: s0[2x]   + s1[2x]
    urhadd      v31.16b, v2.16b,  v4.16b    // loop: s1[2x]   + s2[2x]
    ext         v24.16b, v20.16b, v30.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v22.16b, v31.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v20.16b, v21.16b
    urhadd      v18.16b, v22.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1        {v16.16b},   [x1],  #16
    st1        {v18.16b},   [x3],  #16
    st1        {v17.16b},   [x2],  #16
    st1        {v19.16b},   [x4],  #16
    b.le        3f

    subs        w9,  w9,  #16
    urhadd      v21.16b, v1.16b,  v3.16b    // s0[2x+1] + s1[2x+1]
    urhadd      v23.16b, v3.16b,  v5.16b    // s1[2x+1] + s2[2x+1]

    ld2        {v0.16b,v1.16b}, [x11], #32
    ld2        {v2.16b,v3.16b}, [x12], #32
    ld2        {v4.16b,v5.16b}, [x13], #32
    urhadd      v20.16b, v0.16b,  v2.16b    // loop: s0[2x]   + s1[2x]
    urhadd      v22.16b, v2.16b,  v4.16b    // loop: s1[2x]   + s2[2x]
    ext         v24.16b, v30.16b, v20.16b, #1   // s0[2x+2] + s1[2x+2]
    ext         v25.16b, v31.16b, v22.16b, #1   // s1[2x+2] + s2[2x+2]

    urhadd      v16.16b, v30.16b, v21.16b
    urhadd      v18.16b, v31.16b, v23.16b
    urhadd      v17.16b, v21.16b, v24.16b
    urhadd      v19.16b, v23.16b, v25.16b

    st1        {v16.16b},   [x1],  #16
    st1        {v18.16b},   [x3],  #16
    st1        {v17.16b},   [x2],  #16
    st1        {v19.16b},   [x4],  #16
    b.gt        2b
3:
    subs        w8,  w8,  #1
    add         x0,  x0,  x5,  lsl #1
    add         x1,  x1,  x10
    add         x2,  x2,  x10
    add         x3,  x3,  x10
    add         x4,  x4,  x10
    b.gt        1b

    ret
endfunc
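
// For orientation, a hedged C sketch of the lowres downsampling above
// (illustrative only): each output plane is a half-resolution average of a
// 2x2 source window, with dsth/dstv/dstc shifted by one source pixel
// horizontally/vertically/both; the nested rounded halvings mirror the
// pairs of URHADD instructions.
//
//   #define AVG2( a, b )       ( ((a)+(b)+1) >> 1 )
//   #define AVG4( a, b, c, d ) AVG2( AVG2(a,b), AVG2(c,d) )
//
//   static void frame_init_lowres_core_c( uint8_t *src0, uint8_t *dst0,
//                                         uint8_t *dsth, uint8_t *dstv,
//                                         uint8_t *dstc,
//                                         intptr_t src_stride,
//                                         intptr_t dst_stride,
//                                         int width, int height )
//   {
//       for( int y = 0; y < height; y++ )
//       {
//           uint8_t *s0 = src0, *s1 = s0 + src_stride, *s2 = s1 + src_stride;
//           for( int x = 0; x < width; x++ )
//           {
//               dst0[x] = AVG4( s0[2*x  ], s1[2*x  ], s0[2*x+1], s1[2*x+1] );
//               dsth[x] = AVG4( s0[2*x+1], s1[2*x+1], s0[2*x+2], s1[2*x+2] );
//               dstv[x] = AVG4( s1[2*x  ], s2[2*x  ], s1[2*x+1], s2[2*x+1] );
//               dstc[x] = AVG4( s1[2*x+1], s2[2*x+1], s1[2*x+2], s2[2*x+2] );
//           }
//           src0 += src_stride*2;
//           dst0 += dst_stride; dsth += dst_stride;
//           dstv += dst_stride; dstc += dst_stride;
//       }
//   }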

function x264_load_deinterleave_chroma_fenc_neon, export=1
    mov         x4,  #FENC_STRIDE/2
    b           load_deinterleave_chroma
endfunc

function x264_load_deinterleave_chroma_fdec_neon, export=1
    mov         x4,  #FDEC_STRIDE/2
load_deinterleave_chroma:
    ld2        {v0.8b,v1.8b}, [x1], x2
    ld2        {v2.8b,v3.8b}, [x1], x2
    subs        w3,  w3,  #2
    st1        {v0.8b}, [x0], x4
    st1        {v1.8b}, [x0], x4
    st1        {v2.8b}, [x0], x4
    st1        {v3.8b}, [x0], x4
    b.gt        load_deinterleave_chroma

    ret
endfunc

function x264_plane_copy_deinterleave_neon, export=1
    add         w9,  w6,  #15
    and         w9,  w9,  #0xfffffff0
    sub         x1,  x1,  x9
    sub         x3,  x3,  x9
    sub         x5,  x5,  x9, lsl #1
1:
    ld2        {v0.16b,v1.16b}, [x4], #32
    subs        w9,  w9,  #16
    st1        {v0.16b}, [x0],  #16
    st1        {v1.16b}, [x2],  #16
    b.gt        1b

    add         x4,  x4,  x5
    subs        w7,  w7,  #1
    add         x0,  x0,  x1
    add         x2,  x2,  x3
    mov         w9,  w6
    b.gt       1b

    ret
endfunc

.macro deinterleave_rgb
    subs            x11, x11, #8
    st1            {v0.8b},    [x0], #8
    st1            {v1.8b},    [x2], #8
    st1            {v2.8b},    [x4], #8
    b.gt            1b

    subs            w10, w10, #1
    add             x0,  x0,  x1
    add             x2,  x2,  x3
    add             x4,  x4,  x5
    add             x6,  x6,  x7
    mov             x11, x9
    b.gt            1b
.endm

function x264_plane_copy_deinterleave_rgb_neon, export=1
#if SYS_MACOSX
    ldr             w8,  [sp]
    ldp             w9,  w10, [sp, #4]
#else
    ldr             x8,  [sp]
    ldp             x9,  x10, [sp, #8]
#endif
    cmp             w8,  #3
    uxtw            x9,  w9
    add             x11, x9,  #7
    and             x11, x11, #~7
    sub             x1,  x1,  x11
    sub             x3,  x3,  x11
    sub             x5,  x5,  x11
    b.ne            4f
    sub             x7,  x7,  x11, lsl #1
    sub             x7,  x7,  x11
1:
    ld3            {v0.8b,v1.8b,v2.8b}, [x6], #24
    deinterleave_rgb

    ret
4:
    sub             x7,  x7,  x11, lsl #2
1:
    ld4            {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
    deinterleave_rgb

    ret
endfunc

function x264_plane_copy_interleave_neon, export=1
    add         w9,  w6,  #15
    and         w9,  w9,  #0xfffffff0
    sub         x1,  x1,  x9,  lsl #1
    sub         x3,  x3,  x9
    sub         x5,  x5,  x9
1:
    ld1        {v0.16b}, [x2],  #16
    ld1        {v1.16b}, [x4],  #16
    subs        w9,  w9,  #16
    st2        {v0.16b,v1.16b}, [x0],  #32
    b.gt        1b

    subs        w7,  w7,  #1
    add         x0,  x0,  x1
    add         x2,  x2,  x3
    add         x4,  x4,  x5
    mov         w9,  w6
    b.gt        1b

    ret
endfunc

function x264_store_interleave_chroma_neon, export=1
    mov             x5,  #FDEC_STRIDE
1:
    ld1        {v0.8b}, [x2], x5
    ld1        {v1.8b}, [x3], x5
    ld1        {v2.8b}, [x2], x5
    ld1        {v3.8b}, [x3], x5
    subs        w4,  w4,  #2
    zip1        v4.16b,  v0.16b,  v1.16b
    zip1        v5.16b,  v2.16b,  v3.16b
    st1        {v4.16b}, [x0], x1
    st1        {v5.16b}, [x0], x1
    b.gt        1b

    ret
endfunc
