;
;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
;**************Variables Vs Registers*****************************************
;    r0 => src
;    r1 => dst
;    r2 =>  src_stride
;    r3 =>  dst_stride
;    r4 => filter_x0
;    r8 =>  ht
;    r10 =>  wd
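;
;    Note: "type1" refers to the tap-sign pattern hard-coded below: the
;    absolute values of the taps are multiplied in, with taps 0, 1, 6 and 7
;    subtracted (vmlsl.u8) and taps 2..5 added (vmlal.u8). The companion
;    type2 kernel handles the other sign layout.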

    EXPORT          |vpx_convolve8_avg_horiz_filter_type1_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA  ||.text||, CODE, READONLY, ALIGN=2

|vpx_convolve8_avg_horiz_filter_type1_neon| PROC

    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
                                                 ; the arguments
    vpush           {d8  -  d15}                 ; stack offset by 64
    mov             r4,     r1
    mov             r1,     r2
    mov             r2,     r4
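    ; arguments arrive as (src, src_stride, dst, dst_stride, ...); the three
    ; moves above swap r1 and r2 so that r1 => dst and r2 => src_stride, as
    ; per the register table in the header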

start_loop_count
    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
    ldr             r8,     [sp,    #108]   ;loads x0_q4
    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
    ldr             r8,     [sp,    #128]   ;loads ht
    ldr             r10,    [sp,    #124]   ;loads wd
    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
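    ; vld2.8 de-interleaves the eight 16-bit taps into their low bytes (d0)
    ; and high bytes (d1); only the low bytes are used below, which assumes
    ; every tap of the selected filter fits in 8 bits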
    mov             r11,    #1
    subs            r14,    r8,     #0      ;checks for ht == 0
    vabs.s8         d2,     d0              ;vabs_s8(coeff)
    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
                                            ; 0)
    sub             r12,    r0,     #3      ;pu1_src - 3
    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
                                            ; 1)
    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
                                            ; 2)
    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
                                            ; 3)
    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
                                            ; 4)
    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
                                            ; 5)
    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
                                            ; 6)
    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
                                            ; 7)
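    ; d24-d31 now hold |filter[k]| broadcast across all 8 lanes; the tap
    ; signs are applied by picking vmlal.u8 (add) or vmlsl.u8 (subtract)
    ; per tap in the loops below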
    mov             r7,     r1
    cmp             r10,    #4
    ble             outer_loop_4

    cmp             r10,    #24
    moveq           r10,    #16
    addeq           r8,     #8
    addeq           r9,     #8
    cmp             r10,    #16
    bge             outer_loop_16

    cmp             r10,    #12
    addeq           r8,     #4
    addeq           r9,     #4
    b               outer_loop_8

outer_loop8_residual
    sub             r12,    r0,     #3      ;pu1_src - 3
    mov             r1,     r7
    mov             r14,    #32
    add             r1,     #16
    add             r12,    #16
    mov             r10,    #8
    add             r8,     #8
    add             r9,     #8

outer_loop_8
    add             r6,     r1,     r3      ;pu1_dst + dst_strd
    add             r4,     r12,    r2      ;pu1_src + src_strd
    subs            r5,     r10,    #0      ;checks wd
    ble             end_inner_loop_8

inner_loop_8
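    ; rounding scheme used throughout: the accumulator is seeded with 0xc000
    ; (-16384), vhadd.s16 with 0x4000 (+16384) cancels that bias while
    ; halving the sum, and vqrshrun.s16 #6 rounds, shifts and saturates, so
    ; the stored byte is effectively clip8((sum + 64) >> 7)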
    mov             r7,     #0xc000
    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
    vdup.16         q4,     r7
    vld1.u32        {d1},   [r12],  r11
    vdup.16         q5,     r7
    vld1.u32        {d2},   [r12],  r11
    vld1.u32        {d3},   [r12],  r11
    mov             r7,     #0x4000
    vld1.u32        {d4},   [r12],  r11
    vmlsl.u8        q4,     d1,     d25     ;mul_res = vmlsl_u8(src[0_1],
                                            ; coeffabs_1);
    vld1.u32        {d5},   [r12],  r11
    vmlal.u8        q4,     d3,     d27     ;mul_res = vmlal_u8(src[0_3],
                                            ; coeffabs_3);
    vld1.u32        {d6},   [r12],  r11
    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
                                            ; coeffabs_0);
    vld1.u32        {d7},   [r12],  r11
    vmlal.u8        q4,     d2,     d26     ;mul_res = vmlal_u8(src[0_2],
                                            ; coeffabs_2);
    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
                                            ; coeffabs_4);
    vld1.u32        {d13},  [r4],   r11
    vmlal.u8        q4,     d5,     d29     ;mul_res = vmlal_u8(src[0_5],
                                            ; coeffabs_5);
    vld1.u32        {d14},  [r4],   r11
    vmlsl.u8        q4,     d6,     d30     ;mul_res = vmlsl_u8(src[0_6],
                                            ; coeffabs_6);
    vld1.u32        {d15},  [r4],   r11
    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
                                            ; coeffabs_7);
    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
    vdup.16         q11,    r7
    vmlal.u8        q5,     d15,    d27     ;mul_res = vmlal_u8(src[0_3],
                                            ; coeffabs_3);
    vld1.u32        {d17},  [r4],   r11
    vmlal.u8        q5,     d14,    d26     ;mul_res = vmlal_u8(src[0_2],
                                            ; coeffabs_2);
    vhadd.s16       q4,     q4,     q11
    vld1.u32        {d18},  [r4],   r11
    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
                                            ; coeffabs_4);
    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
    vmlal.u8        q5,     d17,    d29     ;mul_res = vmlal_u8(src[0_5],
                                            ; coeffabs_5);
    vld1.u8         {d6},   [r1]
    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
                                            ; result 1
    vmlsl.u8        q5,     d18,    d30     ;mul_res = vmlsl_u8(src[0_6],
                                            ; coeffabs_6);
    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
                                            ; coeffabs_7);
    vld1.u8         {d7},   [r6]
    vrhadd.u8       d20,    d20,    d6
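    ; "avg" step: vrhadd.u8 computes (a + b + 1) >> 1, averaging the
    ; filtered row with the bytes already at pu1_dst (loaded into d6/d7)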
    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
                                            ; coeffabs_0);
    vmlsl.u8        q5,     d13,    d25     ;mul_res = vmlsl_u8(src[0_1],
                                            ; coeffabs_1);
    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
    vhadd.s16       q5,     q5,     q11
    subs            r5,     r5,     #8      ;decrement the wd loop
    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
                                            ; result 2
    vrhadd.u8       d8,     d8,     d7
    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
    cmp             r5,     #4
    bgt             inner_loop_8

end_inner_loop_8
    subs            r14,    r14,    #2      ;decrement the ht loop
    add             r12,    r12,    r9      ;increment the src pointer by
                                            ; 2*src_strd-wd
    add             r1,     r1,     r8      ;increment the dst pointer by
                                            ; 2*dst_strd-wd
    bgt             outer_loop_8

    ldr             r10,    [sp,    #124]   ;loads wd
    cmp             r10,    #12
    beq             outer_loop4_residual

end_loops
    b               end_func

outer_loop_16
    str             r7,     [sp,  #-4]!
    str             r0,     [sp,  #-4]!
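    ; r0 (src) and r7 (saved dst base) are clobbered inside this loop, so
    ; they are spilled here and reloaded in epilog_16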
    add             r6,     r1,     r3      ;pu1_dst + dst_strd
    add             r4,     r12,    r2      ;pu1_src + src_strd
    and             r0,     r12,    #31
    mov             r7,     #0xc000
    sub             r5,     r10,    #0      ;checks wd
    pld             [r4,    r2,     lsl #1]
    pld             [r12,   r2,     lsl #1]
    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
    vdup.16         q4,     r7
    vld1.u32        {q1},   [r12],  r11
    vld1.u32        {q2},   [r12],  r11
    vld1.u32        {q3},   [r12],  r11
    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
                                            ; coeffabs_0);
    vld1.u32        {q6},   [r12],  r11
    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
                                            ; coeffabs_1);
    vld1.u32        {q7},   [r12],  r11
    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
                                            ; coeffabs_2);
    vld1.u32        {q8},   [r12],  r11
    vmlal.u8        q4,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
                                            ; coeffabs_3);
    vld1.u32        {q9},   [r12],  r11
    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
                                            ; coeffabs_4);
    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
                                            ; coeffabs_5);
    vdup.16         q10,    r7
    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
                                            ; coeffabs_6);
    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
                                            ; coeffabs_7);

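; inner_loop_16 is software-pipelined: loads for the next row pair are
; interleaved with the multiply-accumulates of the current one. The low half
; of each 16-byte load feeds the accumulator for output columns 0-7 (q4/q5),
; the high half the accumulator for columns 8-15 (q10/q11).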
inner_loop_16
    vmlsl.u8        q10,    d1,     d24
    vdup.16         q5,     r7
    vmlsl.u8        q10,    d3,     d25
    mov             r7,     #0x4000
    vdup.16         q11,    r7
    vmlal.u8        q10,    d5,     d26
    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
    vhadd.s16       q4,     q4,     q11
    vld1.u32        {q1},   [r4],   r11
    vmlal.u8        q10,    d7,     d27
    add             r12,    #8
    subs            r5,     r5,     #16
    vmlal.u8        q10,    d13,    d28
    vld1.u32        {q2},   [r4],   r11
    vmlal.u8        q10,    d15,    d29
    vld1.u32        {q3},   [r4],   r11
    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
                                            ; result 1
    vmlsl.u8        q10,    d17,    d30
    vld1.u32        {q6},   [r4],   r11
    vmlsl.u8        q10,    d19,    d31
    vld1.u32        {q7},   [r4],   r11
    add             r7,     r1,     #8
    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
                                            ; coeffabs_0);
    vmlsl.u8        q5,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
                                            ; coeffabs_1);
    vld1.u32        {q8},   [r4],   r11
    vhadd.s16       q10,    q10,    q11
    vld1.u32        {q9},   [r4],   r11
    vld1.u8         {d0},   [r1]
    vmlal.u8        q5,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
                                            ; coeffabs_2);
    vld1.u8         {d2},   [r7]
    vmlal.u8        q5,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
                                            ; coeffabs_3);
    add             r4,     #8
    mov             r7,     #0xc000
    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
                                            ; coeffabs_4);
    vmlal.u8        q5,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
                                            ; coeffabs_5);
    vqrshrun.s16    d9,     q10,    #6
    vdup.16         q11,    r7
    vmlsl.u8        q5,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
                                            ; coeffabs_6);
    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
                                            ; coeffabs_7);
    mov             r7,     #0x4000
    vrhadd.u8       d8,     d8,     d0
    vrhadd.u8       d9,     d9,     d2
    vmlsl.u8        q11,    d1,     d24
    vmlsl.u8        q11,    d3,     d25
    vdup.16         q10,    r7
    vmlal.u8        q11,    d5,     d26
    pld             [r12,   r2,     lsl #2]
    pld             [r4,    r2,     lsl #2]
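    ; the Z flag still reflects "subs r5, r5, #16" above (NEON ops leave
    ; flags untouched), so the addeq/subeq below fire only when a row pair
    ; is complete, advancing the pointers and decrementing ht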
    addeq           r12,    r12,    r9      ;increment the src pointer by
                                            ; 2*src_strd-wd
    addeq           r4,     r12,    r2      ;pu1_src + src_strd
    vmlal.u8        q11,    d7,     d27
    vmlal.u8        q11,    d13,    d28
    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
    subeq           r14,    r14,    #2
    vhadd.s16       q5,     q5,     q10
    vmlal.u8        q11,    d15,    d29
    addeq           r1,     r1,     r8
    vmlsl.u8        q11,    d17,    d30
    cmp             r14,    #0
    vmlsl.u8        q11,    d19,    d31
    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
                                            ; result 2
    beq             epilog_16

    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
    mov             r7,     #0xc000
    cmp             r5,     #0
    vld1.u32        {q1},   [r12],  r11
    vhadd.s16       q11,    q11,    q10
    vld1.u32        {q2},   [r12],  r11
    vdup.16         q4,     r7
    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
                                            ; coeffabs_0);
    vdup.16         q10,    r7
    vld1.u32        {q3},   [r12],  r11
    add             r7,     r6,     #8
    moveq           r5,     r10
    vld1.u8         {d0},   [r6]
    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlsl_u8(src[0_1],
                                            ; coeffabs_1);
    vld1.u8         {d2},   [r7]
    vqrshrun.s16    d11,    q11,    #6
    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlal_u8(src[0_2],
                                            ; coeffabs_2);
    vld1.u32        {q6},   [r12],  r11
    vrhadd.u8       d10,    d10,    d0
    vld1.u32        {q7},   [r12],  r11
    vrhadd.u8       d11,    d11,    d2
    vld1.u32        {q8},   [r12],  r11
    vmlal.u8        q4,     d6,     d27     ;mul_res = vmlal_u8(src[0_3],
                                            ; coeffabs_3);
    vld1.u32        {q9},   [r12],  r11
    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
                                            ; coeffabs_4);
    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlal_u8(src[0_5],
                                            ; coeffabs_5);
    mov             r7,     #0xc000
    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlsl_u8(src[0_6],
                                            ; coeffabs_6);
    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
                                            ; coeffabs_7);
    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
    b               inner_loop_16

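; epilog_16 flushes the last row pair: the remaining accumulator (q11) is
; rounded and narrowed, both halves are averaged with the bytes at pu1_dst
; and stored, and the spilled registers are reloaded.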
epilog_16
    mov             r7,     #0x4000
    ldr             r0,     [sp],   #4
    ldr             r10,    [sp,    #128]   ;loads wd
    vdup.16         q10,    r7
    vhadd.s16       q11,    q11,    q10
    vqrshrun.s16    d11,    q11,    #6
    add             r7,     r6,     #8
    vld1.u8         {d20},  [r6]
    vld1.u8         {d21},  [r7]
    vrhadd.u8       d10,    d10,    d20
    vrhadd.u8       d11,    d11,    d21
    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
    ldr             r7,     [sp],   #4
    cmp             r10,    #24
    beq             outer_loop8_residual

end_loops1
    b               end_func

outer_loop4_residual
    sub             r12,    r0,     #3      ;pu1_src - 3
    mov             r1,     r7
    add             r1,     #8
    mov             r10,    #4
    add             r12,    #8
    mov             r14,    #16
    add             r8,     #4
    add             r9,     #4

outer_loop_4
    add             r6,     r1,     r3      ;pu1_dst + dst_strd
    add             r4,     r12,    r2      ;pu1_src + src_strd
    subs            r5,     r10,    #0      ;checks wd
    ble             end_inner_loop_4

inner_loop_4
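    ; the 4-wide path filters two rows at once: vzip.32 packs four bytes of
    ; row i (low word) and four bytes of row ii (high word) into each d
    ; register, so one MAC chain covers both rows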
    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
    vld1.u32        {d1},   [r12],  r11
    vld1.u32        {d2},   [r12],  r11
    vld1.u32        {d3},   [r12],  r11
    vld1.u32        {d4},   [r12],  r11
    vld1.u32        {d5},   [r12],  r11
    vld1.u32        {d6},   [r12],  r11
    vld1.u32        {d7},   [r12],  r11
    sub             r12,    r12,    #4
    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
    vld1.u32        {d13},  [r4],   r11
    vzip.32         d0,     d12             ;vector zip the i iteration and ii
                                            ; iteration in single register
    vld1.u32        {d14},  [r4],   r11
    vzip.32         d1,     d13
    vld1.u32        {d15},  [r4],   r11
    vzip.32         d2,     d14
    vld1.u32        {d16},  [r4],   r11
    vzip.32         d3,     d15
    vld1.u32        {d17},  [r4],   r11
    vzip.32         d4,     d16
    vld1.u32        {d18},  [r4],   r11
    vzip.32         d5,     d17
    vld1.u32        {d19},  [r4],   r11
    mov             r7,     #0xc000
    vdup.16         q4,     r7
    sub             r4,     r4,     #4
    vzip.32         d6,     d18
    vzip.32         d7,     d19
    vmlsl.u8        q4,     d1,     d25     ;arithmetic operations for rows i
                                            ; and ii happen at the same time
    vmlsl.u8        q4,     d0,     d24
    vmlal.u8        q4,     d2,     d26
    vmlal.u8        q4,     d3,     d27
    vmlal.u8        q4,     d4,     d28
    vmlal.u8        q4,     d5,     d29
    vmlsl.u8        q4,     d6,     d30
    vmlsl.u8        q4,     d7,     d31
    mov             r7,     #0x4000
    vdup.16         q10,    r7
    vhadd.s16       q4,     q4,     q10
    vqrshrun.s16    d8,     q4,     #6
    vld1.u32        {d10[0]},       [r1]
    vld1.u32        {d10[1]},       [r6]
    vrhadd.u8       d8,     d8,     d10
    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
                                            ; is in lower part of the register
    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
                                            ; is in upper part of the register
    subs            r5,     r5,     #4      ;decrement the wd by 4
    bgt             inner_loop_4

end_inner_loop_4
    subs            r14,    r14,    #2      ;decrement the ht by 2
    add             r12,    r12,    r9      ;increment the input pointer
                                            ; 2*src_strd-wd
    add             r1,     r1,     r8      ;increment the output pointer
                                            ; 2*dst_strd-wd
    bgt             outer_loop_4

end_func
    vpop            {d8  -  d15}
    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp

    ENDP

    END
